ChIP AML PiPeline v2

In [662]:
import os
import pandas as pd
import sys
sys.path.insert(0, '../..')
import itertools
from scipy import stats
import numpy as np

from JKBio.epigenetics import chipseq as chip
from JKBio.utils import helper
from JKBio.utils import plot
import igv
import SimpSOM as sps
from scipy import stats

import seaborn as sns
from matplotlib import cm
from matplotlib import pyplot as plt
from IPython.display import IFrame
import seaborn as sns
from bokeh.plotting import *
import igv

import numba
from numba import jit

from scipy.cluster.hierarchy import linkage, leaves_list
from sklearn.cluster import AgglomerativeClustering, DBSCAN, KMeans, OPTICS
from sklearn.mixture import GaussianMixture
from sklearn.manifold import MDS, TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from IPython.display import IFrame

from statsmodels.stats.multitest import multipletests

from pybedtools import BedTool
import pyBigWig

output_notebook()
%load_ext autoreload
%autoreload 2
Loading BokehJS ...
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
In [930]:
#setting basic parameters
project="Cobinding_ChIP"
version="v3"
merging_version="remove_double"
window="150"
In [931]:
merged = pd.read_csv('../results/'+project+'/merged_'+version+'_'+merging_version+'_'+window+'_with_annotations.bed.gz', sep='\t')
In [764]:
#retrieving the data from the previous notebook
%store -r merged
%store -r chrombed
%store -r mergedpeak
%store -r cols
%store -r annot
%store -r version
%store -r merging_version
%store -r window
%store -r crc
In [765]:
merging_version
Out[765]:
'remove_triple'

Correlation with Annotations

non binarized

In [820]:
#data = pd.DataFrame(np.corrcoef(stats.zscore(0.01+merged[merged.columns[cols:]].T)), columns=merged.columns[cols:], index = merged.columns[cols:])
data = pd.DataFrame(np.corrcoef(merged[merged.columns[cols:]].T.astype(bool)), columns=merged.columns[cols:], index = merged.columns[cols:])
link = linkage(data[:annot-cols])
col = data.iloc[annot-cols:]
col = col[[co for co in col.columns if co not in col.index.tolist()]]
rdb = cm.get_cmap('RdBu_r', 256)
for val in col.columns:
    a = [rdb(128+int(v*128)) for v in col[val]]
    col[val] = a

Correlation: hierarchical clustering

A hierarchically clustered heatmap representing the correlation coefficient of each CRC TF with each other. Above is displayed the correlation of non CRC proteins/marks with CRC TFs.

In [821]:
#correlation_withannotation
fig = sns.clustermap(data.iloc[:annot][data.columns[leaves_list(link)]], vmin=-1,vmax=1, col_colors=col.T, figsize=(10,20), cmap='RdBu_r', row_linkage=link, col_cluster=False, colors_ratio=0.01)
fig.ax_col_dendrogram.set_visible(False)
fig.fig.suptitle("correlation_withannotation")
#fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_correlation_with_annotation.pdf')
fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_boolcorrelation_with_annotation.pdf')

plt.show()

binarized

similar plot computed on binarized signal (whether a peak is present at a location or not)

In [822]:
#data = pd.DataFrame(np.corrcoef(stats.zscore(0.01+merged[merged.columns[cols:]].T)), columns=merged.columns[cols:], index = merged.columns[cols:])
data = pd.DataFrame(np.corrcoef(merged[merged.columns[cols:]].T.astype(bool)), columns=merged.columns[cols:], index = merged.columns[cols:])
link = linkage(data[:annot-cols])
col = data.iloc[annot-cols:]
col = col[[co for co in col.columns if co not in col.index.tolist()]]
rdb = cm.get_cmap('RdBu_r', 256)
for val in col.columns:
    a = [rdb(128+int(v*128)) for v in col[val]]
    col[val] = a
In [823]:
#correlation_withannotation
fig = sns.clustermap(data.iloc[:annot][data.columns[leaves_list(link)]], vmin=-1,vmax=1, col_colors=col.T, figsize=(10,15), cmap='RdBu_r', row_linkage=link, col_cluster=False, colors_ratio=0.008)
fig.ax_col_dendrogram.set_visible(False)
fig.fig.suptitle("correlation_onbinarized_signal_withannotation")
#fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_correlation_with_annotation.pdf')
fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_boolcorrelation_onbinarized_signal_with_annotation.pdf')

plt.show()

Looking at peak overlap

How many of peak in A (column) overlaps with peak in B (rows)

in other words:

what is the percentage of B's peaks that are overlaped by A's peaks

In [914]:
overlap, correlation = chip.pairwiseOverlap(merged, norm=True)
enrichment, pvals = chip.enrichment(merged)
we will be correcting for fully similar lines/ columns by removing 1 on their last value
/home/jeremie/.local/lib/python3.8/site-packages/scipy/stats/stats.py:2419: RuntimeWarning: invalid value encountered in true_divide
  return (a - mns) / sstd
/home/jeremie/.local/lib/python3.8/site-packages/numpy/lib/function_base.py:2534: RuntimeWarning: invalid value encountered in true_divide
  c /= stddev[:, None]
/home/jeremie/.local/lib/python3.8/site-packages/numpy/lib/function_base.py:2535: RuntimeWarning: invalid value encountered in true_divide
  c /= stddev[None, :]
/home/jeremie/.local/lib/python3.8/site-packages/numpy/lib/function_base.py:2526: RuntimeWarning: Degrees of freedom <= 0 for slice
  c = cov(x, y, rowvar)
/home/jeremie/.local/lib/python3.8/site-packages/numpy/lib/function_base.py:2455: RuntimeWarning: divide by zero encountered in true_divide
  c *= np.true_divide(1, fact)
/home/jeremie/.local/lib/python3.8/site-packages/numpy/lib/function_base.py:2455: RuntimeWarning: invalid value encountered in multiply
  c *= np.true_divide(1, fact)
/home/jeremie/.local/lib/python3.8/site-packages/scipy/stats/stats.py:2416: RuntimeWarning: Mean of empty slice.
  mns = a.mean(axis=axis, keepdims=True)
/home/jeremie/.local/lib/python3.8/site-packages/numpy/core/_methods.py:153: RuntimeWarning: invalid value encountered in true_divide
  ret = um.true_divide(
/home/jeremie/.local/lib/python3.8/site-packages/numpy/core/_methods.py:216: RuntimeWarning: Degrees of freedom <= 0 for slice
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
/home/jeremie/.local/lib/python3.8/site-packages/numpy/core/_methods.py:185: RuntimeWarning: invalid value encountered in true_divide
  arrmean = um.true_divide(
/home/jeremie/.local/lib/python3.8/site-packages/numpy/core/_methods.py:206: RuntimeWarning: invalid value encountered in true_divide
  ret = um.true_divide(
/home/jeremie/.local/lib/python3.8/site-packages/numpy/lib/function_base.py:393: RuntimeWarning: Mean of empty slice.
  avg = a.mean(axis)
../../JKBio/epigenetics/chipseq.py:1125: RuntimeWarning: divide by zero encountered in log2
  enrichment[i,j+add] = np.log2(e)
In [825]:
#saving the enrichments
overlap.to_csv('../results/'+project+'/'+version+'_'+merging_version+'_'+window+'pairwise_overlap.csv')
correlation.to_csv('../results/'+project+'/'+version+'_'+merging_version+'_'+window+'pairwise_correlation_on_overlap.csv')
enrichment.to_csv('../results/'+project+'/'+version+'_'+merging_version+'_'+window+'pairwise_enrichment_on_overlap.csv')
pvals.to_csv('../results/'+project+'/'+version+'_'+merging_version+'_'+window+'pairwise_enrichment_on_overlap_pvals.csv')
In [935]:
overlap = pd.read_csv('../results/'+project+'/'+version+'_'+merging_version+'_'+window+'pairwise_overlap.csv', index_col=0)
correlation = pd.read_csv('../results/'+project+'/'+version+'_'+merging_version+'_'+window+'pairwise_correlation_on_overlap.csv', index_col=0)
enrichment = pd.read_csv('../results/'+project+'/'+version+'_'+merging_version+'_'+window+'pairwise_enrichment_on_overlap.csv', index_col=0)
pvals = pd.read_csv('../results/'+project+'/'+version+'_'+merging_version+'_'+window+'pairwise_enrichment_on_overlap_pvals.csv', index_col=0)

pairwise overlap

Overlap of COL proteins peaks in ROW protein peaks. A hierarchically clustered heatmap where 1 represent 100% overlap and 0, 0%. (e.g. E2F3 binds almost always (>80% of the time) under SREBF1. But SREBF1 only does so ~10% of the time.) The bottom-left side square are the CRC (clustered). At the top and right side has been added additional signals of histone marks/state and non CRC proteins (non clustered). The plot is non symmetrical as a protein can bind in many more places than another and thus cover all its binding regions while binding in many other places.

In [937]:
link = linkage(overlap.iloc[:annot-cols]) # D being the measurement
c = overlap[[co for co in overlap.columns if co not in overlap.iloc[:annot-cols].index.tolist()]]
viridis = cm.get_cmap('viridis', 256)
for val in c.columns:
    a = [viridis(int(v*255)) for v in c[val]]
    c[val] = a
<ipython-input-937-19a8ad627eb5>:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  c[val] = a
In [938]:
fig = sns.clustermap(overlap.iloc[:annot-cols][overlap.columns[np.concatenate((leaves_list(link), range(annot, len(overlap.columns))))]], row_linkage=link, col_colors=c,figsize=(20,18), colors_ratio=0.01, col_cluster=False)
fig.ax_col_dendrogram.set_visible(False)
fig.fig.suptitle("pairwise_overlap_clustermap")
fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'pairwise_overlap_clustermap.pdf')
plt.show()

Correlation on overlaps

Here we display the correlation on binding signal, only on regions where the two proteins/signals overlap. It allows us to recover signal that would be hidden for sets of TF that would only co-bind in few places.

In [828]:
link = linkage(correlation.iloc[:annot-cols], optimal_ordering=True) # D being the measurement
c = correlation[[co for co in correlation.columns if co not in correlation.iloc[:annot-cols].index.tolist()]]
viridis = cm.get_cmap('viridis', 256)
for val in c.columns:
    a = [rdb(128+int(v*128)) for v in c[val]]
    c[val] = a
<ipython-input-828-63360ec74203>:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  c[val] = a
In [829]:
fig = sns.clustermap(correlation.iloc[:annot-cols][correlation.columns[np.concatenate((leaves_list(link), range(annot, len(correlation.columns))))]], row_linkage=link, col_colors=c, colors_ratio=0.01, figsize=(20,18), vmin=-1, vmax=1, cmap='RdBu_r', col_cluster=False)
fig.ax_col_dendrogram.set_visible(False)
fig.fig.suptitle("correlation_on_pairwise_overlap_clustermap")
fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_correlation_onoverlap.pdf')
plt.show()

Enrichments on the cobinding matrix

Log-enrichments given for each TF and chromatin mark over each other. Enrichment is computed by doing a fisher's exact test on the expected vs observed presence of COLUMN protein peaks under ROW protein peaks. Pvalues are then corrected for multiple hypothesis testing using the BH method. (e.g. here CTCF is very strongly enriched in SMC1 but SMC1 is not as much)

In [950]:
link = linkage(enrichment, optimal_ordering=True)
fig = sns.clustermap(enrichment,figsize=(20,20), vmin=-5,vmax=5, cmap='RdBu_r', col_linkage=link, row_linkage=link)
fig.ax_col_dendrogram.set_visible(False)
fig.ax_row_dendrogram.set_visible(False)
fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_enrichment_clustermap_all_to_all.pdf')
plt.show()

interactive plot

(see above for information) this plot shows the same enrichments as above, except that it shows a decreasing square size for any pvalue above $10^{-9}$. hovering shows enrichment (value) and pvalue (pval) for each protein/mark pair

In [832]:
link = linkage(enrichment, optimal_ordering=True)
plot.correlationMatrix(enrichment.loc[enrichment.columns[leaves_list(link)]][enrichment.columns[leaves_list(link)]].values, dataIsCorr=True, names=enrichment.columns[leaves_list(link)].tolist(), pvals=pvals[enrichment.columns[leaves_list(link)]].loc[enrichment.columns[leaves_list(link)]].values, size=12, title= version + '_' + merging_version + '_' + window + 'enrichment', folder='../results/'+project+'/plots/',interactive=True,minval=-4,maxval=4)
we are assuming you want to display the pvals of your correlation with size
12.808227467613827
/home/jeremie/.local/lib/python3.8/site-packages/bokeh/io/saving.py:125: UserWarning: save() called but no resources were supplied and output_file(...) was never called, defaulting to resources.CDN
  warn("save() called but no resources were supplied and output_file(...) was never called, defaulting to resources.CDN")
/home/jeremie/.local/lib/python3.8/site-packages/bokeh/io/saving.py:138: UserWarning: save() called but no title was supplied and output_file(...) was never called, using default title 'Bokeh Plot'
  warn("save() called but no title was supplied and output_file(...) was never called, using default title 'Bokeh Plot'")
Out[832]:
Figure(
id = '14646', …)

looking only at crcs

In [837]:
enrichment = enrichment[crc&set(enrichment.columns)].loc[crc&set(enrichment.columns)]
In [838]:
link = linkage(enrichment, optimal_ordering=True)
plot.correlationMatrix(enrichment.loc[enrichment.columns[leaves_list(link)]][enrichment.columns[leaves_list(link)]].values, dataIsCorr=True, names=enrichment.columns[leaves_list(link)].tolist(), pvals=pvals[enrichment.columns[leaves_list(link)]].loc[enrichment.columns[leaves_list(link)]].values, size=12, title= version + '_' + merging_version + '_' + window + 'enrichment', folder='../results/'+project+'/plots/',interactive=True,minval=-4,maxval=4)
we are assuming you want to display the pvals of your correlation with size
5.218626658092977
/home/jeremie/.local/lib/python3.8/site-packages/bokeh/io/saving.py:125: UserWarning: save() called but no resources were supplied and output_file(...) was never called, defaulting to resources.CDN
  warn("save() called but no resources were supplied and output_file(...) was never called, defaulting to resources.CDN")
/home/jeremie/.local/lib/python3.8/site-packages/bokeh/io/saving.py:138: UserWarning: save() called but no title was supplied and output_file(...) was never called, using default title 'Bokeh Plot'
  warn("save() called but no title was supplied and output_file(...) was never called, using default title 'Bokeh Plot'")
Out[838]:
Figure(
id = '15166', …)

clustering

I have tried gaussian mixtures and Agglomerative clustering algorithm. Only the second can create a hierarchical clustering.

It seems that gaussian mixture makes more sense given the data we have, for now, is more "homogeneous".

I am still not so happy with the clustering. It can be because of the how much importance, outlier values and the high number of noisy values from locations with no peaks.

We can use similar methods to RNAseq to improve this (clamping values, log transform, first round of PCA..)

using KMeans

In [844]:
data= merged[merged.columns[cols:annot]].values
# using score
#scaled_data = stats.zscore(data)
# using regular scaling
#scaled_data = (data-data.min(0))/(data.max(0)-data.min(0))
#binary scaling
scaled_data = data.astype(bool)
In [845]:
scaled_data.max(0)
Out[845]:
array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True])
In [846]:
#https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html#sklearn.cluster.KMeans
n = 2
n_clust=[10,20,50,100,300]
kmean = KMeans(n_clusters=n_clust[n],n_jobs=8)
groups = kmean.fit_predict(scaled_data)
centroid = kmean.cluster_centers_

plotting center location of clusters (kind of enrichment)

Heatmap of the centroid location of each cluster. centroids represent the center of mass of a cluster in kmean clustering. Here a cluster having a value of 1 means that the center of mass is completely shifted toward that particular TF. It showcase in what type of enhancers the cluster is clustering on (archetypical cobindings). This is very sensitive to the number of clusters and scaling applied

In [847]:
df = pd.DataFrame(centroid,columns=merged.columns[cols:annot]).T
#df = df/df.max()
fig = sns.clustermap(df, vmax=1)
fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_kmeans_'+str(n_clust[n])+'_centroids.pdf')

showing the cobinding matrix with the clusters

the cobinding matrix, clustered using the kmeans clusters. (see cobinding matrix description)

In [848]:
rand = np.random.choice(merged.index,5000)

subgroups = groups[rand]
sorting = np.argsort(subgroups)
rainb = cm.get_cmap('gist_rainbow', len(set(groups)))
colors = [rainb(i) for i in subgroups[sorting]]

viridis = cm.get_cmap('viridis', 256)
data = merged[merged.columns[annot:]]
for val in data.columns:
    data[val]=np.log2(1+data[val])
data = (data-data.min())/(data.max()-data.min())
data = data.iloc[rand].iloc[sorting]
for val in data.columns:
    a = [viridis(int(v*256)) for v in data[val]]
    data[val] = a
data["clusters"]  = colors
<ipython-input-848-770781c76b2b>:11: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[val]=np.log2(1+data[val])
In [850]:
fig = sns.clustermap(np.log2(1.01+merged[merged.columns[cols:annot]].iloc[rand].iloc[sorting].T), vmin=0, vmax=3, figsize=(20,20), z_score=0, colors_ratio=0.01, col_cluster=False,col_colors=data, xticklabels=False)
fig.ax_col_dendrogram.set_visible(False)
fig.fig.suptitle("sorted clustermap of cobindings clustered with Kmeans")
fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_kmeans_'+str(n_clust[n])+'_onbool_clustermap_cobinding.pdf')
plt.show()

Computing enrichment on the clusters

enrichment of each TF/chromatin marks under each cluster (see previous enrichment plot for description)

In [851]:
enr, pvals = chip.enrichment(merged, groups=groups)
../../JKBio/epigenetics/chipseq.py:1168: RuntimeWarning: divide by zero encountered in log2
  groups), columns=bedfile.columns[bedcol:])
In [853]:
# enrichment for each cluster
fig = sns.clustermap(enr.T,figsize=(14,20), col_cluster=0, vmax=4, vmin=-4, cmap='RdBu_r')
fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_kmeans_'+str(n_clust[n])+'_enrichment_on_cluster_frombool_metasubset.pdf')
plt.show()

using bokeh

pvalue will be informed by opacity

In [749]:
fig = plot.correlationMatrix(enr.loc[enr.columns[leaves_list(link)]][enr.columns[leaves_list(link)]].values, maxokpval=10**-9,  pvals= pvals[enr.columns[leaves_list(link)]].loc[enr.columns[leaves_list(link)]], names=enr.columns[leaves_list(link)].tolist(), size=12, title= version + '_' + merging_version + '_' + window + 'enrichment', folder='../results/'+project+'/plots/', interactive=True, minval=-4, maxval=4, dataIsCorr=True)
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-749-6ac8f1585189> in <module>
----> 1 fig = plot.correlationMatrix(enr.loc[enr.columns[leaves_list(link)]][enr.columns[leaves_list(link)]].values, maxokpval=10**-9,  pvals= pvals[enr.columns[leaves_list(link)]].loc[enr.columns[leaves_list(link)]], names=enr.columns[leaves_list(link)].tolist(), size=12, title= version + '_' + merging_version + '_' + window + 'enrichment', folder='../results/'+project+'/plots/', interactive=True, minval=-4, maxval=4, dataIsCorr=True)

~/.local/lib/python3.8/site-packages/pandas/core/indexing.py in __getitem__(self, key)
   1766 
   1767             maybe_callable = com.apply_if_callable(key, self.obj)
-> 1768             return self._getitem_axis(maybe_callable, axis=axis)
   1769 
   1770     def _is_scalar_access(self, key: Tuple):

~/.local/lib/python3.8/site-packages/pandas/core/indexing.py in _getitem_axis(self, key, axis)
   1952                     raise ValueError("Cannot index with multidimensional key")
   1953 
-> 1954                 return self._getitem_iterable(key, axis=axis)
   1955 
   1956             # nested tuple slicing

~/.local/lib/python3.8/site-packages/pandas/core/indexing.py in _getitem_iterable(self, key, axis)
   1593         else:
   1594             # A collection of keys
-> 1595             keyarr, indexer = self._get_listlike_indexer(key, axis, raise_missing=False)
   1596             return self.obj._reindex_with_indexers(
   1597                 {axis: [keyarr, indexer]}, copy=True, allow_dups=True

~/.local/lib/python3.8/site-packages/pandas/core/indexing.py in _get_listlike_indexer(self, key, axis, raise_missing)
   1550             keyarr, indexer, new_indexer = ax._reindex_non_unique(keyarr)
   1551 
-> 1552         self._validate_read_indexer(
   1553             keyarr, indexer, o._get_axis_number(axis), raise_missing=raise_missing
   1554         )

~/.local/lib/python3.8/site-packages/pandas/core/indexing.py in _validate_read_indexer(self, key, indexer, axis, raise_missing)
   1638             if missing == len(indexer):
   1639                 axis_name = self.obj._get_axis_name(axis)
-> 1640                 raise KeyError(f"None of [{key}] are in the [{axis_name}]")
   1641 
   1642             # We (temporarily) allow for some missing keys with .loc, except in

KeyError: "None of [Index(['FLAG_MEF2D', 'RUNX2', 'FLI1', 'SREBF1', 'LMO2', 'SP1', 'ZNF281',\n       'SPI1', 'GATA2', 'IRF8', 'ZMYND8', 'MYC', 'MYB', 'E2F3', 'RXRA',\n       'HOXA9', 'TFAP4', 'STAT5B', 'CEBPA', 'MEIS1', 'RUNX1', 'LYL1', 'ZEB2',\n       'FLAG_GFI1', 'ETV6', 'FOSL2', 'PLAGL2', 'MEF2C', 'MAX'],\n      dtype='object')] are in the [index]"

showing enrichment over cobinding

clustered cobinding matrix where signal for non CRC peaks has been replaced by enrichment of those over the clusters and where signal for CRC peaks has been averaged over the clusters

In [854]:
plot.andrew(groups, merged, annot, cols=8, precise=False, folder = '../results/' + project + '/plots/' + version + '_' + merging_version + '_' + window + '_kmeans_', title = "sorted clustermap of cobindings clustered with Kmeans")
../../JKBio/epigenetics/chipseq.py:1168: RuntimeWarning: divide by zero encountered in log2
  groups), columns=bedfile.columns[bedcol:])
../../JKBio/utils/plot.py:626: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  redblue = cm.get_cmap('RdBu_r',256)
/home/jeremie/.local/lib/python3.8/site-packages/pandas/core/frame.py:2986: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._where(-key, value, inplace=True)
../../JKBio/utils/plot.py:627: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subenr = enr[enr.columns[annot-cols:]]

Using SOMs

Kohonen Self-Organizing Maps. Each nodes learn to recognize a pattern in the dataset, unregarding of its distribution. Each node tries to be as similar as possible to its close neighboor and dissimilar to nodes further away. This allows to reduce dimensionality of the dataset, unregarding of distances. This makes a lot of sense for our data type. we end up with a map of 400 datapoints which represent the binding code in the dataset.

In [860]:
#Import the library
size = 20
#Build a network 20x20 with a weights format taken from the raw_data and activate Periodic Boundary Conditions. 
net = sps.somNet(size,size, scaled_data, PBC=True)

#Train the network for 10000 epochs and with initial learning rate of 0.01. 
net.train(0.01, 10000)

#Save the weights to file
net.save('../results/'+project+'/'+version+'_'+merging_version+'_'+window+'_cobinding_SOMweights_'+str(size))
Periodic Boundary Conditions active.
The weights will be initialised randomly.
Training SOM... done!
In [726]:
scaled_data
Out[726]:
array([[0.        , 0.        , 0.        , ..., 0.        , 0.06987315,
        0.02900529],
       [0.        , 0.        , 0.        , ..., 0.        , 0.05722184,
        0.03514311],
       [0.        , 0.        , 0.        , ..., 0.        , 0.04549254,
        0.0265561 ],
       ...,
       [0.02678915, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.03260858, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.03036455, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])
In [712]:
net = sps.somNet(0,0, scaled_data, loadFile='../results/'+project+'/'+version+'_'+merging_version+'_'+window+'_cobinding_SOMweights_'+str(size), PBC=True)
Periodic Boundary Conditions active.
The weights will be loaded from file.
In [862]:
pathplot = "../results/"+project+'/plots/'+version+'_'+merging_version+'_'+window+"_somNets/"
! mkdir $pathplot

SOMnode enrichment

displays a map of each node, respecting 2D neighboorhood. and its enrichment for a specific protein/mark.

In [863]:
#Print a map of the network nodes and colour them according to the first feature (column number 0) of the dataset
#and then according to the distance between each node and its neighbours.
for col in range(scaled_data.shape[1]):
    net.nodes_graph(colnum=col, printout=True, show=True, path="../results/"+project+'/plots/'+version+'_'+merging_version+'_'+window+"_somNets/", colname=data.columns[col])
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>

average differential enrichment

displays similarity in what the nodes have learnt. how different the sum of their weight is compared to their neighboors.

In [866]:
diffs = net.diff_graph(show=False, returns=True)
plt.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_cobinding_SOM'+str(size)+'.pdf')

interactive somPlot

displays the same differential map as above. but allows the user to look at the key TF/marks that are enriched in the node (might be more than one) this is very sensitive on the filter used to defined what is enriched and what is not.

In [880]:
plot.SOMPlot(net, size=size, colnames=merged.columns[cols:annot], folder='../results/' + project + '/plots/' + version + '_' + merging_version + '_'+window+"_onbool_", minweight=0.5)
/home/jeremie/.local/lib/python3.8/site-packages/bokeh/io/saving.py:125: UserWarning: save() called but no resources were supplied and output_file(...) was never called, defaulting to resources.CDN
  warn("save() called but no resources were supplied and output_file(...) was never called, defaulting to resources.CDN")
/home/jeremie/.local/lib/python3.8/site-packages/bokeh/io/saving.py:138: UserWarning: save() called but no title was supplied and output_file(...) was never called, using default title 'Bokeh Plot'
  warn("save() called but no title was supplied and output_file(...) was never called, using default title 'Bokeh Plot'")

enrichment of SOMnodes

Which node is mostly enriched in which specific signal? the color represent the amount of points mostly recognized that node. the more points, the stronger the enrichment.

Here we have which node recognize signal from super enhancers

In [973]:
prj=np.array(net.project(scaled_data[merged['super_enhancer']!=0]))
<Figure size 432x288 with 0 Axes>
In [974]:
plot.bigScatter(prj,binsize=0.5,showpoint=False,precomputed=False, logscale=True, title='density plot of enhancers in TF cobinding space with TSNE', folder='../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+"_")
/home/jeremie/.local/lib/python3.8/site-packages/bokeh/io/saving.py:125: UserWarning: save() called but no resources were supplied and output_file(...) was never called, defaulting to resources.CDN
  warn("save() called but no resources were supplied and output_file(...) was never called, defaulting to resources.CDN")
/home/jeremie/.local/lib/python3.8/site-packages/bokeh/io/saving.py:138: UserWarning: save() called but no title was supplied and output_file(...) was never called, using default title 'Bokeh Plot'
  warn("save() called but no title was supplied and output_file(...) was never called, using default title 'Bokeh Plot'")
Out[974]:
Figure(
id = '23869', …)
In [872]:
#Cluster the datapoints according to the Quality Threshold algorithm.
clusts = net.cluster(scaled_data, type='KMeans',numcl=n_clust[n])
sorting  = []
for clust in clusts:
    sorting.extend(clust)
sorting = np.array(sorting)
Warning: Only Quality Threshold and Density Peak clustering work with PBC
<Figure size 432x288 with 0 Axes>
In [873]:
viridis = cm.get_cmap('gist_rainbow', len(clusts))
colors=[]
for i, clust in enumerate(clusts):
    colors.extend([viridis(i)]*len(clust))
colors = np.array(colors)
viridis = cm.get_cmap('viridis', 256)
data = merged[merged.columns[annot:]]
for val in data.columns:
    data[val] = np.log2(1+data[val])
data = (data-data.min())/(data.max()-data.min())
rand = np.random.choice(range(len(sorting)),5000)
rand.sort()
data = data.iloc[sorting[rand]]
for val in data.columns:
    a = [viridis(int(v*256)) for v in data[val]]
    data[val] = a
data["clusters"]  = [tuple(i) for i in colors[rand]]
<ipython-input-873-3d497b534963>:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[val] = np.log2(1+data[val])
In [874]:
#sorted clustermap of cobindings clustered with SOM
fig = sns.clustermap(np.log2(1.01+merged[merged.columns[cols:annot]].iloc[sorting[rand]].T), vmin=0,vmax=3,figsize=(20,15),z_score=0, colors_ratio=0.01, col_cluster=False,col_colors=data, xticklabels=False)
fig.ax_col_dendrogram.set_visible(False)
fig.fig.suptitle("sorted clustermap of cobindings clustered with SOM")
fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_SOM_'+str(n_clust)+'_onbool__clustermap_cobinding.pdf')
plt.show()
In [877]:
#let's look at the enrichment over SOM clusters
groups = []
for i, val in enumerate(clusts):
    groups.extend([i]*len(val))
enr, pvals = chip.enrichment(merged, groups = np.array(groups))
In [878]:
fig = sns.clustermap(enr.T,figsize=(12,12), vmax=4, vmin=-4, cmap='RdBu_r')
fig.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_kmeans_'+str(n_clust[n])+'_onbool_enrichment_on_SOMcluster_metasubset.pdf')
plt.show()

Plot TSNE density map

In [809]:
rand = np.random.choice(merged.index,30000)
In [817]:
red_data = TSNE(2,600,verbose=10,n_iter=1500).fit_transform(scaled_data[rand])
np.save('../results/'+project+'/'+version+'_'+merging_version+'_'+window+'red_data.npy',red_data)
[t-SNE] Computing 1801 nearest neighbors...
[t-SNE] Indexed 30000 samples in 0.636s...
[t-SNE] Computed neighbors for 30000 samples in 74.360s...
[t-SNE] Computed conditional probabilities for sample 1000 / 30000
[t-SNE] Computed conditional probabilities for sample 2000 / 30000
[t-SNE] Computed conditional probabilities for sample 3000 / 30000
[t-SNE] Computed conditional probabilities for sample 4000 / 30000
[t-SNE] Computed conditional probabilities for sample 5000 / 30000
[t-SNE] Computed conditional probabilities for sample 6000 / 30000
[t-SNE] Computed conditional probabilities for sample 7000 / 30000
[t-SNE] Computed conditional probabilities for sample 8000 / 30000
[t-SNE] Computed conditional probabilities for sample 9000 / 30000
[t-SNE] Computed conditional probabilities for sample 10000 / 30000
[t-SNE] Computed conditional probabilities for sample 11000 / 30000
[t-SNE] Computed conditional probabilities for sample 12000 / 30000
[t-SNE] Computed conditional probabilities for sample 13000 / 30000
[t-SNE] Computed conditional probabilities for sample 14000 / 30000
[t-SNE] Computed conditional probabilities for sample 15000 / 30000
[t-SNE] Computed conditional probabilities for sample 16000 / 30000
[t-SNE] Computed conditional probabilities for sample 17000 / 30000
[t-SNE] Computed conditional probabilities for sample 18000 / 30000
[t-SNE] Computed conditional probabilities for sample 19000 / 30000
[t-SNE] Computed conditional probabilities for sample 20000 / 30000
[t-SNE] Computed conditional probabilities for sample 21000 / 30000
[t-SNE] Computed conditional probabilities for sample 22000 / 30000
[t-SNE] Computed conditional probabilities for sample 23000 / 30000
[t-SNE] Computed conditional probabilities for sample 24000 / 30000
[t-SNE] Computed conditional probabilities for sample 25000 / 30000
[t-SNE] Computed conditional probabilities for sample 26000 / 30000
[t-SNE] Computed conditional probabilities for sample 27000 / 30000
[t-SNE] Computed conditional probabilities for sample 28000 / 30000
[t-SNE] Computed conditional probabilities for sample 29000 / 30000
[t-SNE] Computed conditional probabilities for sample 30000 / 30000
[t-SNE] Mean sigma: 0.037402
[t-SNE] Computed conditional probabilities in 32.301s
[t-SNE] Iteration 50: error = 72.7553177, gradient norm = 0.0000002 (50 iterations in 26.451s)
[t-SNE] Iteration 100: error = 71.8867798, gradient norm = 0.0018707 (50 iterations in 27.128s)
[t-SNE] Iteration 150: error = 70.7329178, gradient norm = 0.0000012 (50 iterations in 33.320s)
[t-SNE] Iteration 200: error = 70.7330627, gradient norm = 0.0000013 (50 iterations in 37.452s)
[t-SNE] Iteration 250: error = 70.7330551, gradient norm = 0.0000014 (50 iterations in 36.432s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 70.733055
[t-SNE] Iteration 300: error = 2.5959568, gradient norm = 0.0008672 (50 iterations in 39.152s)
[t-SNE] Iteration 350: error = 1.9617102, gradient norm = 0.0008384 (50 iterations in 40.533s)
[t-SNE] Iteration 400: error = 1.7067482, gradient norm = 0.0004231 (50 iterations in 62.360s)
[t-SNE] Iteration 450: error = 1.5842191, gradient norm = 0.0002481 (50 iterations in 64.132s)
[t-SNE] Iteration 500: error = 1.5161682, gradient norm = 0.0001678 (50 iterations in 62.008s)
[t-SNE] Iteration 550: error = 1.4740204, gradient norm = 0.0001238 (50 iterations in 62.388s)
[t-SNE] Iteration 600: error = 1.4448760, gradient norm = 0.0000934 (50 iterations in 62.468s)
[t-SNE] Iteration 650: error = 1.4244875, gradient norm = 0.0000757 (50 iterations in 62.524s)
[t-SNE] Iteration 700: error = 1.4086123, gradient norm = 0.0000645 (50 iterations in 63.268s)
[t-SNE] Iteration 750: error = 1.3962936, gradient norm = 0.0000565 (50 iterations in 66.228s)
[t-SNE] Iteration 900: error = 1.3736291, gradient norm = 0.0000342 (50 iterations in 64.960s)
[t-SNE] Iteration 950: error = 1.3692304, gradient norm = 0.0000281 (50 iterations in 68.919s)
[t-SNE] Iteration 1000: error = 1.3655323, gradient norm = 0.0000243 (50 iterations in 70.773s)
[t-SNE] Iteration 1050: error = 1.3625841, gradient norm = 0.0000224 (50 iterations in 72.901s)
[t-SNE] Iteration 1100: error = 1.3601542, gradient norm = 0.0000226 (50 iterations in 71.137s)
[t-SNE] Iteration 1150: error = 1.3581504, gradient norm = 0.0000167 (50 iterations in 77.071s)
[t-SNE] Iteration 1200: error = 1.3565203, gradient norm = 0.0000152 (50 iterations in 76.978s)
[t-SNE] Iteration 1250: error = 1.3551711, gradient norm = 0.0000141 (50 iterations in 77.427s)
[t-SNE] Iteration 1300: error = 1.3539379, gradient norm = 0.0000128 (50 iterations in 73.403s)
[t-SNE] Iteration 1350: error = 1.3528296, gradient norm = 0.0000124 (50 iterations in 64.483s)
[t-SNE] Iteration 1400: error = 1.3518716, gradient norm = 0.0000110 (50 iterations in 65.703s)
[t-SNE] Iteration 1450: error = 1.3510026, gradient norm = 0.0000111 (50 iterations in 64.661s)
[t-SNE] Iteration 1500: error = 1.3502915, gradient norm = 0.0000093 (50 iterations in 67.108s)
[t-SNE] KL divergence after 1500 iterations: 1.350291
In [815]:
# Let's look at the data. the density map of the distribution of all pseudo enhancer over their tf binding profile
_, ax = plt.subplots(figsize=(10,10))
fig = sns.kdeplot(red_data[:,0], red_data[:,1], shade=True, ax=ax)
plt.savefig('../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+'_density_TSNE.pdf')

A density plot where each point represent 2Dimensional hexagonal bin and the intensity represents the accumulaation of datapoints in these bins. The datapoints represent a random distribution of ~10% of all peaks of the cobinding matrix. They dimensionality reduced using TSNE over the scaled signal of bound TF on these peaks.

These islands and clusters represent categories / types of enhancers defined by their binding code.

In [816]:
# now let's look at it from anothe rploting method. we can see many many islands
plot.bigScatter(red_data,binsize=0.4,showpoint=False,precomputed=False, logscale=True, title='density plot of enhancers in TF cobinding space with TSNE', folder='../results/'+project+'/plots/'+version+'_'+merging_version+'_'+window+"_")
/home/jeremie/.local/lib/python3.8/site-packages/bokeh/io/saving.py:125: UserWarning: save() called but no resources were supplied and output_file(...) was never called, defaulting to resources.CDN
  warn("save() called but no resources were supplied and output_file(...) was never called, defaulting to resources.CDN")
/home/jeremie/.local/lib/python3.8/site-packages/bokeh/io/saving.py:138: UserWarning: save() called but no title was supplied and output_file(...) was never called, using default title 'Bokeh Plot'
  warn("save() called but no title was supplied and output_file(...) was never called, using default title 'Bokeh Plot'")
Out[816]:
Figure(
id = '14093', …)
In [ ]:
## TODO: color by feature

## TODO: color by dependencies/pathways/expression.. after ABC model,